{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {
    "id": "QgumzOZ5wgec",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# Model Training with K-Folds Cross-Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "rByuPhg7wgei",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import joblib\n",
    "import mlflow\n",
    "import mlflow.sklearn\n",
    "from mlflow.tracking import MlflowClient\n",
    "import pandas as pd\n",
    "\n",
    "from pathlib import Path\n",
    "from sklearn import ensemble\n",
    "from typing import Dict, Tuple\n",
    "\n",
    "from evidently.metrics import RegressionQualityMetric, RegressionErrorPlot, RegressionErrorDistribution\n",
    "from evidently.metric_preset import DataDriftPreset, RegressionPreset\n",
    "from evidently.pipeline.column_mapping import ColumnMapping\n",
    "from evidently.report import Report\n",
    "\n",
    "from config import MLFLOW_TRACKING_URI, DATA_DIR, FILENAME, REPORTS_DIR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "HiiUl3p8wgej",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "warnings.simplefilter('ignore')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {
    "id": "zw5Tap_Xwgej",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## Load Data"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {
    "id": "VqGH1Mr6wgej",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "More information about the dataset can be found in UCI machine learning repository: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset\n",
    "\n",
    "Acknowledgement: Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "36Gk-YMhwgek",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Download original dataset with: python src/pipelines/load_data.py \n",
    "\n",
    "raw_data = pd.read_csv(f\"../{DATA_DIR}/{FILENAME}\")\n",
    "\n",
    "raw_data.head()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {
    "id": "dhZOCJZ1wgel",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## Define column mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "_i8edS6Ewgem",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "target = 'cnt'\n",
    "prediction = 'prediction'\n",
    "datetime = 'dteday'\n",
    "numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']\n",
    "categorical_features = ['season', 'holiday', 'workingday', ]\n",
    "\n",
    "column_mapping = ColumnMapping()\n",
    "column_mapping.target = target\n",
    "column_mapping.prediction = prediction\n",
    "column_mapping.datetime = datetime\n",
    "column_mapping.numerical_features = numerical_features\n",
    "column_mapping.categorical_features = categorical_features"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {
    "id": "q5lW24Xzwgex",
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# Define the comparison windows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "L6PKtAGEwgey",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "start_date_0 = '2011-01-02 00:00:00'\n",
    "end_date_0 = '2011-01-30 23:00:00'\n",
    "\n",
    "experiment_batches = [\n",
    "    \n",
    "    ('2011-01-31 00:00:00','2011-02-06 23:00:00'),\n",
    "    ('2011-02-07 23:00:00','2011-02-13 23:00:00'),\n",
    "    ('2011-02-14 23:00:00','2011-02-20 23:00:00'),\n",
    "    ('2011-02-21 00:00:00','2011-02-27 23:00:00'),\n",
    "    ('2011-02-28 00:00:00','2011-03-06 23:00:00'),  \n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define the Reference data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set datetime index \n",
    "raw_data = raw_data.set_index('dteday')\n",
    "\n",
    "# Define the reference dataset\n",
    "reference = raw_data.loc[start_date_0:end_date_0]\n",
    "\n",
    "print(reference.shape)\n",
    "reference.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "bTHU8eAqwgez",
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Set up MLFlow Client\n",
    "mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)\n",
    "client = MlflowClient()\n",
    "print(f\"Client tracking uri: {client.tracking_uri}\")\n",
    "\n",
    "# Set experiment name\n",
    "mlflow.set_experiment('Train Model - K-Fold')\n",
    "\n",
    "# Set experiment variables\n",
    "model_path = Path('../models/model.joblib')\n",
    "ref_end_data = end_date_0\n",
    "\n",
    "# Run model train for each batch (K-Fold)\n",
    "for k, date in enumerate(experiment_batches):\n",
    "\n",
    "    print(f\"Train period: {start_date_0} - {ref_end_data}\") \n",
    "    X_train = raw_data.loc[start_date_0:ref_end_data, numerical_features + categorical_features]\n",
    "    y_train = raw_data.loc[start_date_0:ref_end_data, target]\n",
    "    print(\"X_train (reference) dataset shape: \", X_train.shape, y_train.shape)\n",
    "    \n",
    "    print(f\"Test period: {date[0]} - {date[1]}\") \n",
    "    current = raw_data.loc[date[0]:date[1]]\n",
    "    X_test = current.loc[:, numerical_features + categorical_features]\n",
    "    y_test = current[target]\n",
    "    print(\"X_test (current)) dataset shape: \",  X_test.shape, y_test.shape)\n",
    "    \n",
    "    # Update reference end date\n",
    "    ref_end_data = date[1]\n",
    "    \n",
    "    # Train model\n",
    "    regressor = ensemble.RandomForestRegressor(random_state = 0, n_estimators = 50)\n",
    "    regressor.fit(X_train, y_train)\n",
    "    \n",
    "    # Make predictions \n",
    "    ref_prediction = regressor.predict(reference[numerical_features + categorical_features])\n",
    "    reference['prediction'] = ref_prediction\n",
    "    \n",
    "    prediction = regressor.predict(current[numerical_features + categorical_features])\n",
    "    current['prediction'] = prediction\n",
    "    \n",
    "    # Build the model performance report\n",
    "    regression_performance_report = Report(metrics=[\n",
    "        RegressionQualityMetric(),\n",
    "    ])\n",
    "    regression_performance_report.run(\n",
    "        reference_data=reference, \n",
    "        current_data=current,\n",
    "        column_mapping=column_mapping)\n",
    "    \n",
    "    # Extract Metrics from the report\n",
    "    train_report_metrics = regression_performance_report.as_dict()\n",
    "    me = train_report_metrics['metrics'][0]['result']['current']['mean_error']\n",
    "    mae = train_report_metrics['metrics'][0]['result']['current'][\"mean_abs_error\"]\n",
    "    \n",
    "    # Save model and train_report\n",
    "    joblib.dump(regressor, model_path)\n",
    "    model_quality_report_path = f\"../{REPORTS_DIR}/train_report.html\"\n",
    "    regression_performance_report.save_html(model_quality_report_path)\n",
    "\n",
    "    # Start a new MLflow Run\n",
    "    with mlflow.start_run() as run: \n",
    "        \n",
    "        # Show newly created run metadata info\n",
    "        print(\"Experiment id: {}\".format(run.info.experiment_id))\n",
    "        print(\"Run id: {}\".format(run.info.run_id))\n",
    "        print(\"Run name: {}\".format(run.info.run_name))\n",
    "        print('MLFlow tracking uri:', mlflow.get_tracking_uri())\n",
    "        print('MLFlow artifact uri:', mlflow.get_artifact_uri())\n",
    "        \n",
    "        # Log parameters\n",
    "        mlflow.log_param(\"begin\", date[0])\n",
    "        mlflow.log_param(\"end\", date[1])\n",
    "        \n",
    "        # Log metrics\n",
    "        mlflow.log_metric('me', round(me, 3))\n",
    "        mlflow.log_metric('mae', round(mae, 3))\n",
    "        \n",
    "        # Log model \n",
    "        mlflow.log_artifact(model_path)\n",
    "        \n",
    "        # Log the regression_performance_report as an artifact\n",
    "        mlflow.log_artifact(model_quality_report_path)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Implementing K-Fold Cross-Validation Runs Using MLFlow's Nested Runs Feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up MLFlow Client\n",
    "mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)\n",
    "client = MlflowClient()\n",
    "print(f\"Client tracking uri: {client.tracking_uri}\")\n",
    "\n",
    "# Set experiment name\n",
    "mlflow.set_experiment('Train Model - K-Fold')\n",
    "\n",
    "# Set experiment variables\n",
    "model_path = Path('../models/model.joblib')\n",
    "ref_end_data = end_date_0\n",
    "\n",
    "# Start a new Run (Parent Run)\n",
    "with mlflow.start_run() as run: \n",
    "    \n",
    "    # Update metrics with metrics for each Fold\n",
    "    metrics = {}\n",
    "\n",
    "    # Run model train for each batch (K-Fold)\n",
    "    for k, date in enumerate(experiment_batches):\n",
    "        \n",
    "        # print(k, date[0],  date[1])\n",
    "            \n",
    "        print(f\"Train period: {start_date_0} - {ref_end_data}\") \n",
    "        X_train = raw_data.loc[start_date_0:ref_end_data, numerical_features + categorical_features]\n",
    "        y_train = raw_data.loc[start_date_0:ref_end_data, target]\n",
    "        print(\"X_train (reference) dataset shape: \", X_train.shape, y_train.shape)\n",
    "        \n",
    "        print(f\"Test period: {date[0]} - {date[1]}\") \n",
    "        current = raw_data.loc[date[0]:date[1]]\n",
    "        X_test = current.loc[:, numerical_features + categorical_features]\n",
    "        y_test = current[target]\n",
    "        print(\"X_test (current)) dataset shape: \",  X_test.shape, y_test.shape)\n",
    "        \n",
    "        # Update reference end date\n",
    "        ref_end_data = date[1]\n",
    "\n",
    "        # Train model\n",
    "        regressor = ensemble.RandomForestRegressor(random_state = 0, n_estimators = 50)\n",
    "        regressor.fit(X_train, y_train)\n",
    "        \n",
    "        # Make predictions \n",
    "        ref_prediction = regressor.predict(reference[numerical_features + categorical_features])\n",
    "        reference['prediction'] = ref_prediction\n",
    "        \n",
    "        prediction = regressor.predict(current[numerical_features + categorical_features])\n",
    "        current['prediction'] = prediction \n",
    "        \n",
    "        # Build the model performance report\n",
    "        regression_performance_report = Report(metrics=[\n",
    "            RegressionQualityMetric(),\n",
    "        ])\n",
    "        regression_performance_report.run(\n",
    "            reference_data=reference, \n",
    "            current_data=current,\n",
    "            column_mapping=column_mapping)\n",
    "        \n",
    "        # Extract Metrics from the report\n",
    "        train_report_metrics = regression_performance_report.as_dict()\n",
    "        me = train_report_metrics['metrics'][0]['result']['current']['mean_error']\n",
    "        mae = train_report_metrics['metrics'][0]['result']['current'][\"mean_abs_error\"]\n",
    "        metrics.update({date[1]: {'me': me, 'mae': mae}})\n",
    "        \n",
    "        # Save train_report for (the Fold)\n",
    "        model_quality_report_path = f\"../{REPORTS_DIR}/train_report.html\"\n",
    "        regression_performance_report.save_html(model_quality_report_path)\n",
    "        \n",
    "        # Run a Child Run for each Fold \n",
    "        with mlflow.start_run(run_name=date[1], nested=True) as child_run:\n",
    "            \n",
    "            # Show newly created run metadata info\n",
    "            print(\"Experiment id: {}\".format(run.info.experiment_id))\n",
    "            print(\"Run id: {}\".format(run.info.run_id))\n",
    "            print(\"Run name: {}\".format(run.info.run_name))\n",
    "            print('MLFlow tracking uri:', mlflow.get_tracking_uri())\n",
    "            print('MLFlow artifact uri:', mlflow.get_artifact_uri())\n",
    "            \n",
    "            # Log parameters\n",
    "            mlflow.log_param(\"begin\", date[0])\n",
    "            mlflow.log_param(\"end\", date[1])\n",
    "            \n",
    "            # Log metrics\n",
    "            mlflow.log_metric('me', round(me, 3))\n",
    "            mlflow.log_metric('mae', round(mae, 3))\n",
    "            \n",
    "            # Log the regression_performance_report as an artifact\n",
    "            mlflow.log_artifact(model_quality_report_path)\n",
    "        \n",
    "    # Save model\n",
    "    joblib.dump(regressor, model_path)\n",
    "    \n",
    "    # Log the last batch model as the parent Run model\n",
    "    mlflow.log_artifact(model_path)\n",
    "    \n",
    "    # Log metrics\n",
    "    average_run_merics = pd.DataFrame.from_dict(metrics).T.mean().round(3).to_dict()\n",
    "    mlflow.log_metrics(average_run_merics )"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
